Data preparation

Dataset - German Housing Data

Origin data set cleaned:

Variable rooms, bathrooms, bedrooms, floors, garages, Year_built, Year_renovated changed from decimal to integer Subset with buildings up to 20 rooms for our analysis created Rooms with .5 rounded up Levels of the variable Energy_source and Garagetype transformed.

Load of the cleaned dataset

data <- read.csv('german_housing_cleaned.csv',header =T, encoding='UTF-8')
head(data)
##     Price              Type Living_space Lot Rooms Bedrooms Bathrooms Floors
## 1  498000 Multiple dwelling       106.00 229     5        3         1      2
## 2  495000 Mid-terrace house       140.93 517     6        3         2     NA
## 3  749000         Farmhouse       162.89  82     5        3         2      4
## 4  259000         Farmhouse       140.00 814     4       NA         2      2
## 5  469000 Multiple dwelling       115.00 244     4        2         1     NA
## 6 1400000 Mid-terrace house       310.00 860     8       NA        NA      3
##   Year_built Furnishing_quality Year_renovated   Condition         Heating
## 1       2005             normal             NA refurbished central heating
## 2       1994              basic             NA refurbished   stove heating
## 3       2013                                NA dilapidated   stove heating
## 4       1900              basic           2000 fixer-upper central heating
## 5       1968            refined           2019 refurbished central heating
## 6       1969              basic             NA  maintained                
##      Energy_source Energy_efficiency_class             State              City
## 1      natural gas                       D Baden-Württemberg     Bodenseekreis
## 2                                          Baden-Württemberg  Konstanz (Kreis)
## 3 district heating                       B Baden-Württemberg Esslingen (Kreis)
## 4      electricity                       G Baden-Württemberg  Waldshut (Kreis)
## 5              oil                       F Baden-Württemberg Esslingen (Kreis)
## 6              oil                         Baden-Württemberg         Stuttgart
##                     Place Garages  Garagetype
## 1             Bermatingen       2 Parking lot
## 2                   Engen       7 Parking lot
## 3              Ostfildern       1      Garage
## 4 Bonndorf im Schwarzwald       1      Garage
## 5 Leinfelden-Echterdingen       1      Garage
## 6                     Süd       2      Garage

Inspection of all variables

str(data)
summary(data)
colnames(data)
apply(data,2,function(x) sum(is.na(x)))

Analysis of the distribution of the variables: ‘Price’, ‘Living_space’, ‘Rooms’ and ‘Lot’

par(mfrow = c(2,2))

#Price
plot(density(data$Price))

#Living_space
plot(density(data$Living_space))

#Rooms
hist(data$Rooms)

#Lot
plot(density(data$Lot))

Result: The variables are right skewed

Log Transformation of variables

Therefore we will use the Log Transformation of the variables ‘Price’, ‘Living_space’, ‘Rooms’, ‘Lot’ to get a nearly normal distribution

par(mfrow = c(2,2))

#Price
price.log <- density(log(data$Price))
plot(price.log)

#Living_space
living.log <- density(log(data$Living_space))
plot(living.log)

#Rooms
rooms.log <- log(data$Rooms)
hist(rooms.log)

#Lot
lot.log <- density(log(data$Lot))
plot(lot.log)

Add new columns with log transformed variables price, living space and rooms

data1 <- data
data1$log.price <- log(data1$Price)
data1$log.living <- log(data1$Living_space)
data1$log.rooms <- log(data1$Rooms)
data1$log.lot <- log(data1$Lot)
data1$Condition <- factor(data1$Condition)
str(data1)
## 'data.frame':    10318 obs. of  24 variables:
##  $ Price                  : num  498000 495000 749000 259000 469000 1400000 3500000 630000 364000 1750000 ...
##  $ Type                   : chr  "Multiple dwelling" "Mid-terrace house" "Farmhouse" "Farmhouse" ...
##  $ Living_space           : num  106 141 163 140 115 ...
##  $ Lot                    : num  229 517 82 814 244 860 5300 406 973 1460 ...
##  $ Rooms                  : int  5 6 5 4 4 8 13 10 10 6 ...
##  $ Bedrooms               : num  3 3 3 NA 2 NA NA NA 4 4 ...
##  $ Bathrooms              : num  1 2 2 2 1 NA 4 NA 4 2 ...
##  $ Floors                 : num  2 NA 4 2 NA 3 NA 3 2 3 ...
##  $ Year_built             : num  2005 1994 2013 1900 1968 ...
##  $ Furnishing_quality     : chr  "normal" "basic" "" "basic" ...
##  $ Year_renovated         : num  NA NA NA 2000 2019 ...
##  $ Condition              : Factor w/ 8 levels "","as new","dilapidated",..: 8 8 3 6 8 7 3 8 8 8 ...
##  $ Heating                : chr  "central heating" "stove heating" "stove heating" "central heating" ...
##  $ Energy_source          : chr  "natural gas" "" "district heating" "electricity" ...
##  $ Energy_efficiency_class: chr  "D" "" "B" "G" ...
##  $ State                  : chr  "Baden-Württemberg" "Baden-Württemberg" "Baden-Württemberg" "Baden-Württemberg" ...
##  $ City                   : chr  "Bodenseekreis" "Konstanz (Kreis)" "Esslingen (Kreis)" "Waldshut (Kreis)" ...
##  $ Place                  : chr  "Bermatingen" "Engen" "Ostfildern" "Bonndorf im Schwarzwald" ...
##  $ Garages                : num  2 7 1 1 1 2 7 2 8 2 ...
##  $ Garagetype             : chr  "Parking lot" "Parking lot" "Garage" "Garage" ...
##  $ log.price              : num  13.1 13.1 13.5 12.5 13.1 ...
##  $ log.living             : num  4.66 4.95 5.09 4.94 4.74 ...
##  $ log.rooms              : num  1.61 1.79 1.61 1.39 1.39 ...
##  $ log.lot                : num  5.43 6.25 4.41 6.7 5.5 ...

1 Week 1 - Linear Models

1.1 Data Visualisation and Linear regressions

options(scipen=999) #block scientific notation
library(ggplot2)
attach(data)

1.2 Scatterplot with regression line for log(Price) against log(Living_space)

We plot the response variable “Price” against the predictor “Living_Space” to get a first impression and grahical analysis.

#Living_space
ggplot(data1, aes(log.living, log.price)) + geom_point() + geom_smooth(method = lm, se = T, color = 'red') + ggtitle('Scatterplot with regression line for log(Price) against log.living')

1.3 Fitting a Simple Linear regression of log(Price) against log(Living_space) and check the coefficients

#linear model
lm.log.price_living <- lm(log.price ~ log.living, data = data1)
summary(lm.log.price_living)
## 
## Call:
## lm(formula = log.price ~ log.living, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7836 -0.4079  0.0856  0.4683  2.7581 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  8.17522    0.07596  107.62 <0.0000000000000002 ***
## log.living   0.90280    0.01455   62.05 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7145 on 10316 degrees of freedom
## Multiple R-squared:  0.2718, Adjusted R-squared:  0.2717 
## F-statistic:  3850 on 1 and 10316 DF,  p-value: < 0.00000000000000022
#estimated regression coefficients
coef(lm.log.price_living)
## (Intercept)  log.living 
##   8.1752232   0.9028032

1.4 Linear regression of log(Price) against log(Living_space) including the Type and finding the intercept for the different Types.

##linear model
lm.log.price_living_type <- lm(data = data1, log.price ~ log.living + Type)
summary(lm.log.price_living_type)
## 
## Call:
## lm(formula = log.price ~ log.living + Type, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9802 -0.3849  0.0716  0.4556  2.7356 
## 
## Coefficients:
##                          Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)               7.47225    0.09409  79.414 < 0.0000000000000002 ***
## log.living                0.99691    0.01679  59.378 < 0.0000000000000002 ***
## TypeBungalow              0.24691    0.05737   4.304    0.000016956457415 ***
## TypeCastle               -0.33356    0.31169  -1.070             0.284573    
## TypeCorner house         -0.14759    0.06058  -2.436             0.014857 *  
## TypeDuplex               -0.06320    0.03900  -1.620             0.105186    
## TypeFarmhouse             0.26026    0.04599   5.659    0.000000015603919 ***
## TypeMid-terrace house     0.24470    0.03679   6.651    0.000000000030494 ***
## TypeMultiple dwelling     0.35983    0.05029   7.155    0.000000000000891 ***
## TypeResidential property  0.17411    0.05094   3.418             0.000634 ***
## TypeSingle dwelling       0.39721    0.04091   9.708 < 0.0000000000000002 ***
## TypeSpecial property      0.46909    0.05103   9.193 < 0.0000000000000002 ***
## TypeVilla                 0.70309    0.05077  13.847 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6912 on 10305 degrees of freedom
## Multiple R-squared:  0.3193, Adjusted R-squared:  0.3185 
## F-statistic: 402.8 on 12 and 10305 DF,  p-value: < 0.00000000000000022
#estimated regression coefficients
coef(lm.log.price_living_type)
##              (Intercept)               log.living             TypeBungalow 
##                7.4722543                0.9969080                0.2469139 
##               TypeCastle         TypeCorner house               TypeDuplex 
##               -0.3335592               -0.1475928               -0.0631954 
##            TypeFarmhouse    TypeMid-terrace house    TypeMultiple dwelling 
##                0.2602565                0.2447012                0.3598267 
## TypeResidential property      TypeSingle dwelling     TypeSpecial property 
##                0.1741124                0.3972086                0.4690858 
##                TypeVilla 
##                0.7030881
#intercept of Type "NULL"
coef(lm.log.price_living_type)['(Intercept)']
## (Intercept) 
##    7.472254
#intercept of Type single dwelling
coef(lm.log.price_living_type)['(Intercept)'] + coef(lm.log.price_living_type)['TypeSingle dwelling']
## (Intercept) 
##    7.869463

1.5 Linear regression of log(Price) against log(Living_space) including the Type interaction

##linear model
lm.log.price_living_type2 <- lm(data = data1, log.price ~ log.living * Type)
summary(lm.log.price_living_type2)
## 
## Call:
## lm(formula = log.price ~ log.living * Type, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0574 -0.3835  0.0672  0.4576  2.7139 
## 
## Coefficients:
##                                     Estimate Std. Error t value
## (Intercept)                          7.73609    0.27923  27.705
## log.living                           0.94614    0.05331  17.749
## TypeBungalow                         0.92173    0.48628   1.895
## TypeCastle                          -1.04467    5.02138  -0.208
## TypeCorner house                    -0.80053    0.58144  -1.377
## TypeDuplex                           1.34421    0.35134   3.826
## TypeFarmhouse                        0.63145    0.53470   1.181
## TypeMid-terrace house               -0.68139    0.31354  -2.173
## TypeMultiple dwelling                0.24793    0.69059   0.359
## TypeResidential property            -0.31372    0.45320  -0.692
## TypeSingle dwelling                 -0.75763    0.42213  -1.795
## TypeSpecial property                -0.77518    0.45681  -1.697
## TypeVilla                            0.06219    0.60861   0.102
## log.living:TypeBungalow             -0.11718    0.08861  -1.322
## log.living:TypeCastle                0.12151    0.79328   0.153
## log.living:TypeCorner house          0.12396    0.10936   1.133
## log.living:TypeDuplex               -0.24967    0.06560  -3.806
## log.living:TypeFarmhouse            -0.08080    0.10831  -0.746
## log.living:TypeMid-terrace house     0.18009    0.06011   2.996
## log.living:TypeMultiple dwelling     0.01967    0.13964   0.141
## log.living:TypeResidential property  0.09284    0.08538   1.087
## log.living:TypeSingle dwelling       0.23218    0.08354   2.779
## log.living:TypeSpecial property      0.25164    0.09095   2.767
## log.living:TypeVilla                 0.11600    0.10750   1.079
##                                                 Pr(>|t|)    
## (Intercept)                         < 0.0000000000000002 ***
## log.living                          < 0.0000000000000002 ***
## TypeBungalow                                    0.058061 .  
## TypeCastle                                      0.835198    
## TypeCorner house                                0.168607    
## TypeDuplex                                      0.000131 ***
## TypeFarmhouse                                   0.237658    
## TypeMid-terrace house                           0.029787 *  
## TypeMultiple dwelling                           0.719593    
## TypeResidential property                        0.488813    
## TypeSingle dwelling                             0.072720 .  
## TypeSpecial property                            0.089737 .  
## TypeVilla                                       0.918609    
## log.living:TypeBungalow                         0.186066    
## log.living:TypeCastle                           0.878261    
## log.living:TypeCorner house                     0.257038    
## log.living:TypeDuplex                           0.000142 ***
## log.living:TypeFarmhouse                        0.455705    
## log.living:TypeMid-terrace house                0.002741 ** 
## log.living:TypeMultiple dwelling                0.887975    
## log.living:TypeResidential property             0.276883    
## log.living:TypeSingle dwelling                  0.005460 ** 
## log.living:TypeSpecial property                 0.005673 ** 
## log.living:TypeVilla                            0.280593    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6879 on 10294 degrees of freedom
## Multiple R-squared:  0.3264, Adjusted R-squared:  0.3249 
## F-statistic: 216.9 on 23 and 10294 DF,  p-value: < 0.00000000000000022
#estimated regression coefficients
coef(lm.log.price_living_type2)
##                         (Intercept)                          log.living 
##                          7.73608549                          0.94613962 
##                        TypeBungalow                          TypeCastle 
##                          0.92172518                         -1.04467031 
##                    TypeCorner house                          TypeDuplex 
##                         -0.80052753                          1.34420839 
##                       TypeFarmhouse               TypeMid-terrace house 
##                          0.63144756                         -0.68139164 
##               TypeMultiple dwelling            TypeResidential property 
##                          0.24793247                         -0.31371529 
##                 TypeSingle dwelling                TypeSpecial property 
##                         -0.75763067                         -0.77518375 
##                           TypeVilla             log.living:TypeBungalow 
##                          0.06219264                         -0.11718366 
##               log.living:TypeCastle         log.living:TypeCorner house 
##                          0.12151270                          0.12396267 
##               log.living:TypeDuplex            log.living:TypeFarmhouse 
##                         -0.24967456                         -0.08079852 
##    log.living:TypeMid-terrace house    log.living:TypeMultiple dwelling 
##                          0.18008764                          0.01967192 
## log.living:TypeResidential property      log.living:TypeSingle dwelling 
##                          0.09283769                          0.23217929 
##     log.living:TypeSpecial property                log.living:TypeVilla 
##                          0.25164379                          0.11599769
# The "P-Values - confidence intervals" duality
confint(lm.log.price_living_type2)
##                                            2.5 %      97.5 %
## (Intercept)                           7.18873463  8.28343635
## log.living                            0.84164786  1.05063138
## TypeBungalow                         -0.03148560  1.87493595
## TypeCastle                          -10.88754235  8.79820173
## TypeCorner house                     -1.94027217  0.33921710
## TypeDuplex                            0.65550987  2.03290692
## TypeFarmhouse                        -0.41667545  1.67957057
## TypeMid-terrace house                -1.29599299 -0.06679028
## TypeMultiple dwelling                -1.10576712  1.60163207
## TypeResidential property             -1.20207438  0.57464379
## TypeSingle dwelling                  -1.58509414  0.06983281
## TypeSpecial property                 -1.67062159  0.12025410
## TypeVilla                            -1.13079934  1.25518461
## log.living:TypeBungalow              -0.29088557  0.05651824
## log.living:TypeCastle                -1.43346849  1.67649390
## log.living:TypeCorner house          -0.09041307  0.33833842
## log.living:TypeDuplex                -0.37826357 -0.12108555
## log.living:TypeFarmhouse             -0.29311479  0.13151774
## log.living:TypeMid-terrace house      0.06226578  0.29790949
## log.living:TypeMultiple dwelling     -0.25405926  0.29340310
## log.living:TypeResidential property  -0.07451478  0.26019016
## log.living:TypeSingle dwelling        0.06841797  0.39594061
## log.living:TypeSpecial property       0.07335498  0.42993261
## log.living:TypeVilla                 -0.09472399  0.32671936

1.6 Measures of fit

formula(lm.log.price_living_type)
## log.price ~ log.living + Type
summary(lm.log.price_living_type)$r.squared
## [1] 0.3192727
summary(lm.log.price_living_type)$adj.r.squared
## [1] 0.31848
formula(lm.log.price_living_type2)
## log.price ~ log.living * Type
summary(lm.log.price_living_type2)$r.squared
## [1] 0.3263949
summary(lm.log.price_living_type2)$adj.r.squared
## [1] 0.3248899

1.7 Fitted values

The function fitted() can be used to extract the predicted values for the existing observations

attach(data)
## The following objects are masked from data (pos = 3):
## 
##     Bathrooms, Bedrooms, City, Condition, Energy_efficiency_class,
##     Energy_source, Floors, Furnishing_quality, Garages, Garagetype,
##     Heating, Living_space, Lot, Place, Price, Rooms, State, Type,
##     Year_built, Year_renovated
#lm.log.price_living
fitted.price_living <- fitted(lm.log.price_living)
str(fitted.price_living)
##  Named num [1:10318] 12.4 12.6 12.8 12.6 12.5 ...
##  - attr(*, "names")= chr [1:10318] "1" "2" "3" "4" ...
head(fitted.price_living)
##        1        2        3        4        5        6 
## 12.38539 12.64253 12.77327 12.63655 12.45896 13.35422
plot(log(Price)~ log(Living_space), main = 'Model log(Price) ~ log(Living_space)', col = 'navy', pch = 16)
points(fitted.price_living ~ log(Living_space), col = 'red', pch = 16)
abline(lm.log.price_living, col = 'yellow', lwd = 2.5)

1.8 Residuals of model log(Price) ~ log(Living_space)

attach(data1)
resid.price_living <- resid(lm.log.price_living)
length(resid.price_living)
## [1] 10318
head(resid.price_living)
##          1          2          3          4          5          6 
##  0.7329643  0.4697818  0.7532264 -0.1719706  0.5993948  0.7977636
set.seed(100)
id <- sample(x = 1:10318, size = 5)
resid.price_living[id]
##       3786        503       3430       3696       4090 
##  0.5106768 -0.3315001  0.4470366 -0.9261093  1.0834033
fitted.price_living[id]
##     3786      503     3430     3696     4090 
## 13.77484 12.69884 13.46378 12.41883 13.03221
plot(log(Price) ~ log(Living_space), main = 'Model log(Price) ~ log(Living_space)', col = 'navy', pch = 16)
abline(lm.log.price_living, col = 'green', lwd = 2.5)

points(log(Price) ~ log(Living_space), data = data1[id, ], col = 'red', pch = 4, lwd = 5)
segments(x0 = data1[id, 'log.living'], x1 = data1[id, 'log.living'],
         y0 = fitted.price_living[id], y1 = data1[id, 'log.price'], col = 'yellow', lwd = 2)

## Predicting values using splitted data set 80:20 ratio

#split dataset 
split80 <- round(nrow(data1)* 0.80)
train <- data1[1:split80,]
test <- data1[(split80 + 1):nrow(data1),]
dim(train)
## [1] 8254   24
dim(test)
## [1] 2064   24
#linear regression model
lm.train <- lm(log.price ~ log.living, data = train)
summary(lm.train)
## 
## Call:
## lm(formula = log.price ~ log.living, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8765 -0.3774  0.0557  0.4283  2.6734 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  8.45284    0.07981  105.91 <0.0000000000000002 ***
## log.living   0.86819    0.01524   56.96 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6563 on 8252 degrees of freedom
## Multiple R-squared:  0.2822, Adjusted R-squared:  0.2821 
## F-statistic:  3245 on 1 and 8252 DF,  p-value: < 0.00000000000000022
#predictions
pred.new.living <- predict(object = lm.train, newdata = test)
pred.new.living.CI <- predict(object = lm.train, interval = 'prediction', newdata = test)

#display predictions
plot(log.price ~log.living, data = data1, main = 'Prediction with Model log(Price) ~ log(Living_space)', col = 'navy', pch = 16)
points(x = test$log.living, y= pred.new.living, col = 'red', pch = 16, cex = 1.5)
abline(lm.train, col = 'yellow', lwd = 2.5)

plot(log.price ~ log.living, data = train, main = 'Prediction with Model log(Price) ~ log(Living_space)', col = 'navy', pch = 16)
segments(x0 = test$log.living, x1 = test$log.living,
         y0 = pred.new.living.CI[, 'lwr'], y1 = pred.new.living.CI[, 'upr'], lwd = 2, col = 'green')
points(x = test$log.living, y= pred.new.living.CI[,'fit'], col = 'red', pch = 16, cex =1.5)
abline(lm.train, col = 'yellow', lwd = 2.5)

1.9 Testing the effect of a categorical variable and post-hoc contrasts

condition.box.with_outlier <- ggplot(data1, aes(x=Condition, y=log.price)) + geom_boxplot(outlier.colour = 'red')+ theme(axis.text.x = element_text(angle = 90)) + ggtitle('Boxplots of log(Price) against Condition with outliers in red')
plot(condition.box.with_outlier)

#model
lm.price_condition.1 <- lm(log.price ~ Condition, data = data1)
summary(lm.price_condition.1)
## 
## Call:
## lm(formula = log.price ~ Condition, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6043 -0.4304  0.0333  0.4884  3.6247 
## 
## Coefficients:
##                                               Estimate Std. Error t value
## (Intercept)                                   12.95695    0.04213 307.553
## Conditionas new                               -1.01885    0.24696  -4.126
## Conditiondilapidated                           0.47108    0.04838   9.738
## Conditionfirst occupation                      0.19331    0.09330   2.072
## Conditionfirst occupation after refurbishment -0.05091    0.05685  -0.895
## Conditionfixer-upper                          -0.10881    0.05398  -2.016
## Conditionmaintained                           -0.42435    0.04906  -8.649
## Conditionrefurbished                          -0.14229    0.04328  -3.288
##                                                           Pr(>|t|)    
## (Intercept)                                   < 0.0000000000000002 ***
## Conditionas new                                          0.0000373 ***
## Conditiondilapidated                          < 0.0000000000000002 ***
## Conditionfirst occupation                                  0.03829 *  
## Conditionfirst occupation after refurbishment              0.37055    
## Conditionfixer-upper                                       0.04384 *  
## Conditionmaintained                           < 0.0000000000000002 ***
## Conditionrefurbished                                       0.00101 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8071 on 10310 degrees of freedom
## Multiple R-squared:  0.07146,    Adjusted R-squared:  0.07083 
## F-statistic: 113.4 on 7 and 10310 DF,  p-value: < 0.00000000000000022
#coefficients
coef(lm.price_condition.1)
##                                   (Intercept) 
##                                   12.95694626 
##                               Conditionas new 
##                                   -1.01884973 
##                          Conditiondilapidated 
##                                    0.47108462 
##                     Conditionfirst occupation 
##                                    0.19330741 
## Conditionfirst occupation after refurbishment 
##                                   -0.05090966 
##                          Conditionfixer-upper 
##                                   -0.10881029 
##                           Conditionmaintained 
##                                   -0.42434950 
##                          Conditionrefurbished 
##                                   -0.14228677
aggregate(log.price ~Condition, 
          FUN = mean, data = data1)
##                              Condition log.price
## 1                                       12.95695
## 2                               as new  11.93810
## 3                          dilapidated  13.42803
## 4                     first occupation  13.15025
## 5 first occupation after refurbishment  12.90604
## 6                          fixer-upper  12.84814
## 7                           maintained  12.53260
## 8                          refurbished  12.81466
#model without slope, only intercept
lm.price_condition.0 <- lm(log.price ~ 1, data = data1)
summary(lm.price_condition.0)
## 
## Call:
## lm(formula = log.price ~ 1, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6576 -0.4388  0.0287  0.5166  3.5125 
## 
## Coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 12.867983   0.008243    1561 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8373 on 10317 degrees of freedom
coef(lm.price_condition.0)
## (Intercept) 
##    12.86798
#Anova
anova(lm.price_condition.0, lm.price_condition.1)
## Analysis of Variance Table
## 
## Model 1: log.price ~ 1
## Model 2: log.price ~ Condition
##   Res.Df    RSS Df Sum of Sq      F                Pr(>F)    
## 1  10317 7232.5                                              
## 2  10310 6715.7  7    516.86 113.36 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#post-hoc contrasts
library(multcomp)
unique(data1$Condition)
## [1] refurbished                          dilapidated                         
## [3] fixer-upper                          maintained                          
## [5] as new                                                                   
## [7] first occupation after refurbishment first occupation                    
## 8 Levels:  as new dilapidated ... refurbished
ph.test.1 <- glht(model = lm.price_condition.1, linfct = mcp(Condition = c('refurbished - dilapidated = 0')))
summary(ph.test.1)
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: User-defined Contrasts
## 
## 
## Fit: lm(formula = log.price ~ Condition, data = data1)
## 
## Linear Hypotheses:
##                                Estimate Std. Error t value            Pr(>|t|)
## refurbished - dilapidated == 0 -0.61337    0.02576  -23.81 <0.0000000000000002
##                                   
## refurbished - dilapidated == 0 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

R uses “treatment contrasts” and therefore the Intercept refers to the first in alphabetical order, here “Null”. The other coefficients represent the difference.

1.10 Adding more categorical variables to the testing above

#str(data1)
Year_built1 <- as.integer(data1$Year_built)
floor1 <- as.integer(data1$Floors)
typeof(Year_built1)
## [1] "integer"
lm.price_condition.2 <- update(lm.price_condition.1,. ~ . + Type + log.rooms + State +Energy_efficiency_class + Year_built1 + Furnishing_quality + floor1)
formula(lm.price_condition.2)
## log.price ~ Condition + Type + log.rooms + State + Energy_efficiency_class + 
##     Year_built1 + Furnishing_quality + floor1
drop1(lm.price_condition.2, test = "F")
## Single term deletions
## 
## Model:
## log.price ~ Condition + Type + log.rooms + State + Energy_efficiency_class + 
##     Year_built1 + Furnishing_quality + floor1
##                         Df Sum of Sq    RSS     AIC  F value
## <none>                               2148.4 -8631.8         
## Condition                7     17.57 2165.9 -8587.1   8.3684
## Type                    11    134.63 2283.0 -8215.5  40.7950
## log.rooms                1    152.62 2301.0 -8138.9 508.7221
## State                   15    638.74 2787.1 -6784.8 141.9381
## Energy_efficiency_class  9     33.56 2181.9 -8538.0  12.4294
## Year_built1              1     56.88 2205.2 -8445.4 189.5786
## Furnishing_quality       4    346.08 2494.4 -7562.8 288.3902
## floor1                   1     31.86 2180.2 -8527.7 106.1896
##                                        Pr(>F)    
## <none>                                           
## Condition                     0.0000000003205 ***
## Type                    < 0.00000000000000022 ***
## log.rooms               < 0.00000000000000022 ***
## State                   < 0.00000000000000022 ***
## Energy_efficiency_class < 0.00000000000000022 ***
## Year_built1             < 0.00000000000000022 ***
## Furnishing_quality      < 0.00000000000000022 ***
## floor1                  < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2 Week 2 - Non-linearity

2.1 Polynomials

By including polynomials (e.g. x1 + x1^2) we can model non linear relationships with a Linear Model.

library(ggplot2)
attach(data1)

Graphical analysis

log(Price) ~ log(Living_space)

gg.log.price_log.living <- ggplot(data1,mapping = aes(y = log.price, x = log.living)) + geom_point()
gg.log.price_log.living + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

gg.log.price_log.living <- ggplot(data1,mapping = aes(y = log.price, x = Year_built1)) + geom_point()
gg.log.price_log.living + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 666 rows containing non-finite values (stat_smooth).
## Warning: Removed 666 rows containing missing values (geom_point).

gg.log.price_log.living <- ggplot(data1,mapping = aes(y = log.price, x = log.living, colour = log.rooms)) + geom_point()
gg.log.price_log.living + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Quadratic Effects

#model with a linear effect for log.living
lm.living.1 <- lm(log.price ~ log.living + Year_built1)
drop1(lm.living.1, test = "F")
## Single term deletions
## 
## Model:
## log.price ~ log.living + Year_built1
##             Df Sum of Sq    RSS     AIC F value                Pr(>F)    
## <none>                   4106.2 -8243.4                                  
## log.living   1   1938.49 6044.6 -4513.1  4555.2 < 0.00000000000000022 ***
## Year_built1  1    655.04 4761.2 -6816.8  1539.3 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#model with a quadratic effect for log.living
lm.living.2 <- update(lm.living.1, . ~ . + I(log.living^2))
summary(lm.living.2)
## 
## Call:
## lm(formula = log.price ~ log.living + Year_built1 + I(log.living^2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6813 -0.3840  0.0215  0.4053  4.1527 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)     -3.8930634  0.5350769  -7.276    0.000000000000371 ***
## log.living       1.9999960  0.1905934  10.494 < 0.0000000000000002 ***
## Year_built1      0.0046546  0.0001207  38.553 < 0.0000000000000002 ***
## I(log.living^2) -0.1004879  0.0180993  -5.552    0.000000028983449 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6513 on 9648 degrees of freedom
##   (666 observations deleted due to missingness)
## Multiple R-squared:  0.3711, Adjusted R-squared:  0.3709 
## F-statistic:  1897 on 3 and 9648 DF,  p-value: < 0.00000000000000022
#test in quadratic
anova(lm.living.1, lm.living.2)
## Analysis of Variance Table
## 
## Model 1: log.price ~ log.living + Year_built1
## Model 2: log.price ~ log.living + Year_built1 + I(log.living^2)
##   Res.Df    RSS Df Sum of Sq      F        Pr(>F)    
## 1   9649 4106.2                                      
## 2   9648 4093.1  1    13.077 30.825 0.00000002898 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Second Model has lower RSS Value and therefore slightly better

#plot
gg.log.price_log.living + geom_smooth(method = 'lm', formula = y ~poly(x, degree = 2))

#model with a quadratic poly
lm.living.3 <- lm(log.price ~ log.rooms + poly(log.living, degree = 2))
summary(lm.living.3)
## 
## Call:
## lm(formula = log.price ~ log.rooms + poly(log.living, degree = 2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9761 -0.3927  0.0714  0.4552  2.6652 
## 
## Coefficients:
##                               Estimate Std. Error t value            Pr(>|t|)
## (Intercept)                   13.68390    0.04616  296.42 <0.0000000000000002
## log.rooms                     -0.44504    0.02490  -17.88 <0.0000000000000002
## poly(log.living, degree = 2)1 59.73564    1.11071   53.78 <0.0000000000000002
## poly(log.living, degree = 2)2 -7.82809    0.70453  -11.11 <0.0000000000000002
##                                  
## (Intercept)                   ***
## log.rooms                     ***
## poly(log.living, degree = 2)1 ***
## poly(log.living, degree = 2)2 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7009 on 10314 degrees of freedom
## Multiple R-squared:  0.2994, Adjusted R-squared:  0.2992 
## F-statistic:  1469 on 3 and 10314 DF,  p-value: < 0.00000000000000022
#model with a cubic poly
lm.living.4 <- lm(log.price ~ log.rooms + poly(log.living, degree = 3))
summary(lm.living.4)
## 
## Call:
## lm(formula = log.price ~ log.rooms + poly(log.living, degree = 3))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9763 -0.3928  0.0714  0.4553  2.6650 
## 
## Coefficients:
##                               Estimate Std. Error t value            Pr(>|t|)
## (Intercept)                   13.68416    0.04669 293.108 <0.0000000000000002
## log.rooms                     -0.44519    0.02519 -17.676 <0.0000000000000002
## poly(log.living, degree = 3)1 59.74067    1.11848  53.412 <0.0000000000000002
## poly(log.living, degree = 3)2 -7.82850    0.70465 -11.110 <0.0000000000000002
## poly(log.living, degree = 3)3 -0.02715    0.70903  -0.038               0.969
##                                  
## (Intercept)                   ***
## log.rooms                     ***
## poly(log.living, degree = 3)1 ***
## poly(log.living, degree = 3)2 ***
## poly(log.living, degree = 3)3    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.701 on 10313 degrees of freedom
## Multiple R-squared:  0.2994, Adjusted R-squared:  0.2991 
## F-statistic:  1102 on 4 and 10313 DF,  p-value: < 0.00000000000000022
gg.log.lot.log.price <- ggplot(data = data1, mapping = aes(y = log.lot, x = log.price)) + geom_point()
gg.log.lot.log.price + geom_smooth(method = 'gam')

2.2 Regression Splines

library(splines)
lm.regression_splines <- lm(log.price ~ bs(log.living, df = 3))
summary(lm.regression_splines)
## 
## Call:
## lm(formula = log.price ~ bs(log.living, df = 3))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8153 -0.4074  0.0784  0.4607  2.7500 
## 
## Coefficients:
##                         Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)               9.2652     0.2111  43.897 <0.0000000000000002 ***
## bs(log.living, df = 3)1   3.9273     0.4428   8.869 <0.0000000000000002 ***
## bs(log.living, df = 3)2   4.1135     0.1701  24.182 <0.0000000000000002 ***
## bs(log.living, df = 3)3   5.4793     0.4107  13.342 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7115 on 10314 degrees of freedom
## Multiple R-squared:  0.2782, Adjusted R-squared:  0.278 
## F-statistic:  1325 on 3 and 10314 DF,  p-value: < 0.00000000000000022

2.3 Generalised Additive Models - GAMs

library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.

GAMs for log(Price) ~ s(Rooms)

attach(data1)
## The following objects are masked from data1 (pos = 6):
## 
##     Bathrooms, Bedrooms, City, Condition, Energy_efficiency_class,
##     Energy_source, Floors, Furnishing_quality, Garages, Garagetype,
##     Heating, Living_space, log.living, log.lot, log.price, log.rooms,
##     Lot, Place, Price, Rooms, State, Type, Year_built, Year_renovated
## The following objects are masked from data1 (pos = 12):
## 
##     Bathrooms, Bedrooms, City, Condition, Energy_efficiency_class,
##     Energy_source, Floors, Furnishing_quality, Garages, Garagetype,
##     Heating, Living_space, log.living, log.lot, log.price, log.rooms,
##     Lot, Place, Price, Rooms, State, Type, Year_built, Year_renovated
## The following objects are masked from data (pos = 13):
## 
##     Bathrooms, Bedrooms, City, Condition, Energy_efficiency_class,
##     Energy_source, Floors, Furnishing_quality, Garages, Garagetype,
##     Heating, Living_space, Lot, Place, Price, Rooms, State, Type,
##     Year_built, Year_renovated
## The following objects are masked from data (pos = 14):
## 
##     Bathrooms, Bedrooms, City, Condition, Energy_efficiency_class,
##     Energy_source, Floors, Furnishing_quality, Garages, Garagetype,
##     Heating, Living_space, Lot, Place, Price, Rooms, State, Type,
##     Year_built, Year_renovated
gam.log.price.rooms <- gam(log.price ~ s(log.rooms))
summary(gam.log.price.rooms)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log.price ~ s(log.rooms)
## 
## Parametric coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 12.867983   0.007783    1653 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df     F             p-value    
## s(log.rooms) 7.888  8.623 145.8 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.108   Deviance explained = 10.9%
## GCV = 0.62562  Scale est. = 0.62509   n = 10318
plot(gam.log.price.rooms, residuals = TRUE, cex = 2)

GAMs for log(Price) ~ log(Living_space) + s(Rooms) + s(Garages)

gam.log.price.log.living <- gam(log.price ~ log.living + s(log.rooms) + s(log(Garages)))
summary(gam.log.price.log.living)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log.price ~ log.living + s(log.rooms) + s(log(Garages))
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  6.46447    0.13313   48.56 <0.0000000000000002 ***
## log.living   1.24132    0.02547   48.74 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df      F              p-value    
## s(log.rooms)    4.964  6.084 67.976 < 0.0000000000000002 ***
## s(log(Garages)) 8.215  8.681  3.852            0.0000713 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.305   Deviance explained = 30.6%
## GCV = 0.42626  Scale est. = 0.4255    n = 8437
plot(gam.log.price.log.living, residuals = TRUE, select = 1)

#stroe the residuals in data1
data1$resid.price_living <- resid(lm.log.price_living)

#creat QQ-Plot

qqnorm(resid(lm.log.price_living))
qqline(resid(lm.log.price_living))

Expected value of the errors is “on zero”

ggplot(mapping = aes(y = resid(lm.log.price_living),
                     x = fitted(lm.log.price_living))) +
  geom_abline(intercept = 0, slope = 0) + geom_point() +
  geom_smooth()

probably non-linear because the smoother is not on zero

Homoscedasticity

ggplot(mapping = aes(y = abs(resid(lm.log.price_living)), x = fitted(lm.log.price_living))) +
  geom_abline(intercept = 0, slope = 0) + geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

The variance of the residuals seems to be fairly constant

Cooks distance

## 1) compute Cook’s distance 

cooks.dist.log.price_living <- cooks.distance(lm.log.price_living) 

##
## 2) plot it

ggplot(data = data1,
       mapping = aes(y = log.price, x = log.living,
                     colour = cooks.dist.log.price_living)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) + facet_wrap(. ~ Type)
## `geom_smooth()` using formula 'y ~ x'

plot(lm.log.price_living, which = 5)

Testing the model assumptions

plot(lm.log.price_living_type)

Bootstrap

#mean(data$Price)
##
B <- 10^3
t.mean <- c()

for(i in 1:B){
  t.id <- sample(1:10318, replace = TRUE)
  t.data.price <- data$Price[t.id]
  t.mean[i] <- mean(t.data.price)
}


##
length(t.mean)
## [1] 1000
hist(t.mean, breaks = 50)
abline(v = mean(data$Price), col = "red")

sorted.means <- sort(t.mean) 
quantile(sorted.means, probs = c(0.025, 0.975))
##     2.5%    97.5% 
## 529609.2 551910.5